[VPlan] Convert EVL loops to variable-length stepping after dissolution #147222
Conversation
Loop regions require fixed-length steps and rounded-up trip counts, but once dissolution has created explicit control flow, EVL loops can use variable-length stepping with the original trip count. This patch adds a post-dissolution transform pass to convert EVL loops from fixed-length to variable-length stepping.
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms Author: Shih-Po Hung (arcbbb) Changes: Loop regions require fixed-length steps and rounded-up trip counts, but once dissolution has created explicit control flow, EVL loops can use variable-length stepping with the original trip count. This patch adds a post-dissolution transform pass to convert EVL loops from fixed-length to variable-length stepping. Patch is 194.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/147222.diff 27 Files Affected:
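Conceptually, the change in stepping looks like this (a plain C++ sketch of the semantics, not the emitted IR; VF stands in for VF*UF, getVectorLength for llvm.experimental.get.vector.length, and the element count N = 10 is made up for illustration):

#include <algorithm>
#include <cstdint>
#include <cstdio>

static constexpr uint64_t VF = 4;

static uint64_t getVectorLength(uint64_t Remaining) {
  return std::min(Remaining, VF);
}

int main() {
  const uint64_t N = 10;

  // Fixed-length stepping: the loop region steps by a constant VF and needs a
  // rounded-up vector trip count (the tail is handled by masking).
  const uint64_t VectorTripCount = (N + VF - 1) / VF * VF;
  for (uint64_t I = 0; I != VectorTripCount; I += VF)
    std::printf("fixed:    iv=%llu step=%llu\n", (unsigned long long)I,
                (unsigned long long)VF);

  // Variable-length stepping: advance by the EVL computed from the remaining
  // elements and compare the EVL-based IV against the original trip count N.
  for (uint64_t I = 0; I != N;) {
    const uint64_t EVL = getVectorLength(N - I);
    std::printf("variable: iv=%llu evl=%llu\n", (unsigned long long)I,
                (unsigned long long)EVL);
    I += EVL;
  }
  return 0;
}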
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 907839711a39c..5878ba4e8cb78 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7344,6 +7344,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
// Regions are dissolved after optimizing for VF and UF, which completely
// removes unneeded loop regions first.
VPlanTransforms::dissolveLoopRegions(BestVPlan);
+ // Enable variable-length stepping for EVL loops after regions are dissolved
+ VPlanTransforms::simplifyEVLIVs(BestVPlan);
// Perform the actual loop transformation.
VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
OrigLoop->getParentLoop(),
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 8e05b0138eeed..f8944df0c6e01 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -2357,6 +2357,58 @@ bool VPlanTransforms::tryAddExplicitVectorLength(
return true;
}
+void VPlanTransforms::simplifyEVLIVs(VPlan &Plan) {
+ auto ConvertEVLPhi = [](VPlan &Plan, VPBasicBlock *Entry,
+ VPEVLBasedIVPHIRecipe *EVLPhi) {
+ using namespace llvm::VPlanPatternMatch;
+ VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
+
+ // Convert EVLPhi to concrete recipe.
+ auto *ScalarR = VPBuilder(EVLPhi).createScalarPhi(
+ {EVLPhi->getStartValue(), EVLIncrement}, EVLPhi->getDebugLoc(),
+ "evl.based.iv");
+ EVLPhi->replaceAllUsesWith(ScalarR);
+ EVLPhi->eraseFromParent();
+
+ // Find the latch-exiting block and convert to variable-length stepping.
+ // Before: (branch-on-cond CanonicalIVInc, VectorTripCount)
+ // After: (branch-on-cond EVLIVInc, TripCount)
+ auto FindLatchExiting = [](VPBasicBlock *Entry) {
+ auto Range =
+ VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(Entry));
+ auto It = find_if(Range, [&](VPBasicBlock *VPBB) {
+ return any_of(VPBB->successors(),
+ [&](VPBlockBase *Succ) { return Succ == Entry; });
+ });
+ return It != Range.end() ? *It : nullptr;
+ };
+ VPBasicBlock *LatchExiting = FindLatchExiting(Entry);
+ assert(LatchExiting && "LatchExiting is not found");
+ auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
+ VPValue *ScalarIVInc;
+ if (!LatchExitingBr ||
+ !match(LatchExitingBr,
+ m_BranchOnCount(m_VPValue(ScalarIVInc),
+ m_Specific(&Plan.getVectorTripCount()))))
+ return;
+ LatchExitingBr->setOperand(1, Plan.getTripCount());
+ ScalarIVInc->replaceAllUsesWith(EVLIncrement);
+ VPRecipeBase *IVIncR = ScalarIVInc->getDefiningRecipe();
+ VPRecipeBase *ScalarIV = IVIncR->getOperand(0)->getDefiningRecipe();
+ IVIncR->eraseFromParent();
+ ScalarIV->eraseFromParent();
+ };
+
+ // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+ vp_depth_first_shallow(Plan.getEntry())))
+ for (VPRecipeBase &R : VPBB->phis())
+ if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
+ ConvertEVLPhi(Plan, VPBB, PhiR);
+ break;
+ }
+}
+
void VPlanTransforms::dropPoisonGeneratingRecipes(
VPlan &Plan,
const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
@@ -2688,15 +2740,6 @@ void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan,
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
vp_depth_first_deep(Plan.getEntry()))) {
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
- if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
- auto *ScalarR = VPBuilder(PhiR).createScalarPhi(
- {PhiR->getStartValue(), PhiR->getBackedgeValue()},
- PhiR->getDebugLoc(), "evl.based.iv");
- PhiR->replaceAllUsesWith(ScalarR);
- ToRemove.push_back(PhiR);
- continue;
- }
-
if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
ToRemove.push_back(WidenIVR);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 8d2eded45da22..927ad860fc67b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -193,6 +193,17 @@ struct VPlanTransforms {
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);
+ /// Transform EVL loops to use variable-length stepping after region
+ /// dissolution.
+ ///
+ /// Once loop regions are replaced with explicit CFG, EVL loops can step with
+ /// variable vector lengths instead of fixed lengths. This transformation:
+ /// * EVL-Phi concretization (makes them concrete)
+ /// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
+ /// VectorTripCount) with variable-length stepping (branch-on-cond
+ /// EVLIVInc, TripCount).
+ static void simplifyEVLIVs(VPlan &Plan);
+
/// Lower abstract recipes to concrete ones, that can be codegen'd. Use \p
/// CanonicalIVTy as type for all un-typed live-ins in VPTypeAnalysis.
static void convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 38ada33d7ee19..7c160c3e0d597 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -192,7 +192,9 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const {
errs() << "EVL used by unexpected VPInstruction\n";
return false;
}
- if (I->getNumUsers() != 1) {
+ // EVLIVIncrement is only used by EVLIV & BranchOnCount.
+ // More than two is unexpected.
+ if (I->getNumUsers() > 2) {
errs() << "EVL is used in VPInstruction with multiple users\n";
return false;
}
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
index 25f52b2a99ddc..297786ae6bfa3 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/evl-compatible-loops.ll
@@ -23,7 +23,6 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP10]]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_EVL_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -37,9 +36,8 @@ define void @test_wide_integer_induction(ptr noalias %a, i64 %N) {
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VEC_IND]], ptr align 8 [[TMP15]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[TMP8]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
index d485a7432423a..1f9e862aad118 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -132,7 +132,6 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-OUTLOOP-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 4
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-OUTLOOP: vector.body:
-; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
@@ -144,8 +143,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-OUTLOOP-NEXT: [[VP_OP:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP9]]
; IF-EVL-OUTLOOP-NEXT: [[TMP10]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[VP_OP]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP5]])
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
-; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], [[N]]
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
; IF-EVL-OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP10]])
@@ -188,7 +186,6 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-INLOOP-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP3]], 8
; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-INLOOP: vector.body:
-; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[TMP5:%.*]] = sub i32 [[N]], [[EVL_BASED_IV]]
@@ -200,8 +197,7 @@ define i32 @add_i16_i32(ptr nocapture readonly %x, i32 %n) {
; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vp.reduce.add.nxv8i32(i32 0, <vscale x 8 x i32> [[TMP14]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP6]], [[EVL_BASED_IV]]
-; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]]
-; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_EVL_NEXT]], [[N]]
; IF-EVL-INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
; IF-EVL-INLOOP-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
@@ -358,7 +354,6 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; IF-EVL-OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-OUTLOOP: vector.body:
-; IF-EVL-OUTLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[BROADCAST_SPLAT]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-OUTLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -371,8 +366,7 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-OUTLOOP-NEXT: [[TMP15]] = call <vscale x 4 x i32> @llvm.vp.merge.nxv4i32(<vscale x 4 x i1> splat (i1 true), <vscale x 4 x i32> [[TMP14]], <vscale x 4 x i32> [[VEC_PHI]], i32 [[TMP9]])
; IF-EVL-OUTLOOP-NEXT: [[TMP16:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-OUTLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP16]], [[EVL_BASED_IV]]
-; IF-EVL-OUTLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
-; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; IF-EVL-OUTLOOP-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
; IF-EVL-OUTLOOP-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL-OUTLOOP: middle.block:
; IF-EVL-OUTLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.smin.nxv4i32(<vscale x 4 x i32> [[TMP15]])
@@ -409,7 +403,6 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[TMP8:%.*]] = mul nuw i64 [[TMP7]], 4
; IF-EVL-INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL-INLOOP: vector.body:
-; IF-EVL-INLOOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[START:%.*]], [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-INLOOP-NEXT: [[AVL:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]]
@@ -421,9 +414,8 @@ define i32 @smin(ptr %a, i64 %n, i32 %start) {
; IF-EVL-INLOOP-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP13]], i32 [[VEC_PHI]])
; IF-EVL-INLOOP-NEXT: [[TMP14:%.*]] = zext i32 [[TMP9]] to i64
; IF-EVL-INLOOP-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]]
-; IF-EVL-INLOOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]]
-; IF-EVL-INLOOP-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; IF-EVL-INLOOP-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; IF-EVL-INLOOP-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]]
+; IF-EVL-INLOOP-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; IF-EVL-INLOOP: middle.block:
; IF-EVL-INLOOP-NEXT: br label [[FOR_END:%.*]]
; IF-EVL-INLOOP: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
index 45357dd6bf0d6..b425ffadb3bf1 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-masked-access.ll
@@ -110,47 +110,38 @@ define void @masked_strided_factor2(ptr noalias nocapture readonly %p, ptr noali
; PREDICATED_EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; PREDICATED_EVL: vector.ph:
; PREDICATED_EVL-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32
-; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4
-; PREDICATED_EVL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1023
-; PREDICATED_EVL-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], [[TMP1]]
-; PREDICATED_EVL-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4
; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; PREDICATED_EVL-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
; PREDICATED_EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; PREDICATED_EVL: vector.body:
-; PREDICATED_EVL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDICATED_EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; PREDICATED_EVL-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; PREDICATED_EVL-NEXT: [[AVL:%.*]] = sub i32 1024, [[EVL_BASED_IV]]
-; PREDICATED_EVL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
-; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
+; PREDICATED_EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i32(i32 [[AVL]], i32 16, i1 true)
+; PREDICATED_EVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP1]], i64 0
; PREDICATED_EVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
-; PREDICATED_EVL-NEXT: [[TMP6:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; PREDICATED_EVL-NEXT: [[TMP7:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
-; PREDICATED_EVL-NEXT: [[TMP8:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP8]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP9]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP10:%.*]] = or disjoint <vscale x 16 x i32> [[TMP7]], splat (i32 1)
-; PREDICATED_EVL-NEXT: [[TMP11:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[P]], <vscale x 16 x i64> [[TMP11]]
-; PREDICATED_EVL-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP12]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP13:%.*]] = call <vscale x 16 x i8> @llvm.smax.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i8> [[WIDE_MASKED_GATHER3]])
-; PREDICATED_EVL-NEXT: [[TMP14:%.*]] = zext nneg <vscale x 16 x i32> [[TMP7]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP14]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP13]], <vscale x 16 x ptr> align 1 [[TMP15]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[TMP16:%.*]] = sub <vscale x 16 x i8> zeroinitializer, [[TMP13]]
-; PREDICATED_EVL-NEXT: [[TMP17:%.*]] = zext nneg <vscale x 16 x i32> [[TMP10]] to <vscale x 16 x i64>
-; PREDICATED_EVL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[Q]], <vscale x 16 x i64> [[TMP17]]
-; PREDICATED_EVL-NEXT: call void @llvm.vp.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8> [[TMP16]], <vscale x 16 x ptr> align 1 [[TMP18]], <vscale x 16 x i1> [[TMP6]], i32 [[TMP5]])
-; PREDICATED_EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i32 [[TMP5]], [[EVL_BASED_IV]]
-; PREDICATED_EVL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]]
+; PREDICATED_EVL-NEXT: [[TMP2:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; PREDICATED_EVL-NEXT: [[TMP3:%.*]] = shl nuw nsw <vscale x 16 x i32> [[VEC_IND]], splat (i32 1)
+; PREDICATED_EVL-NEXT:...
[truncated]
Is the plan for this to replace #131005? From what I remember in that PR we wanted to keep the canonical IV around for the other vectorization passes since it's more amenable to analysis. Maybe things have changed in the meantime. cc @mshockwave
This is preparation work to enable uncountable EVL loops for speculative loads (#128593). This approach should be able to replace #131005 since both accomplish the same goal, but it runs earlier in the vectorizer pipeline rather than at the end.
@fhahn while there have been earlier attempts to make the EVL loop uncountable, this is the first one undertaken after dissolving the structured loop form. Does this align with the intended VPlan design?
This should be fine from the VPlan perspective, but I cannot really comment on whether the overall result is better or worse for RISC-V.
Co-authored-by: Florian Hahn <[email protected]>
@@ -0,0 +1,68 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt %s -S -mtriple riscv64 -passes=loop-vectorize --prefer-predicate-over-epilogue=predicate-dont-vectorize -force-tail-folding-style=data-with-evl -riscv-v-min-trip-count=0 -force-target-instruction-cost=1 -mattr=+v | FileCheck %s
This test must be moved into the RISCV subdirectory.
@@ -0,0 +1,68 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
For consistency with other related tests, it would be good to call it vector-loop-backedge-elimination-with-evl.ll or something like that.
// Replace CanonicalIVInc with EVL-PHI increment
VPRecipeBase *CanonicalIV = &*Entry->begin();
assert(dyn_cast<VPPhi>(CanonicalIV) && "Unexpected canoincal iv");
I think we can use an isa check here:
assert(dyn_cast<VPPhi>(CanonicalIV) && "Unexpected canoincal iv");
assert(isa<VPPhi>(CanonicalIV) && "Unexpected canonical iv");
VPValue *Backedge = CanonicalIV->getOperand(1);
Backedge->replaceAllUsesWith(EVLIncrement);
// Remove unused phi
Nit
// Remove unused phi
// Remove unused phi and increment
LGTM with the nits I posted earlier + Florian's comments. I tested this on TSVC too and most loops are unchanged, but there are a couple of places where we actually end up improving the loop, e.g. in Reductions-dbl:
+ sub s1, s11, a6
+ sh2add a4, a6, s2
+ slli a0, a6, 10
.LBB14_7: # %vector.body
# Parent Loop BB14_3 Depth=1
# Parent Loop BB14_5 Depth=2
# => This Inner Loop Header: Depth=3
- add a0, a5, a2
+ sub a3, s1, a2
add a1, s0, a2
- add s1, a3, s6
- flw fa5, 0(a6)
- sub a0, s8, a0
- sh2add a1, a1, s3
- vsetvli a0, a0, e32, m2, ta, ma
- add s1, s1, a1
- vle32.v v8, (s1)
- vle32.v v10, (a1)
- sub a4, a4, s2
- vfnmsac.vf v10, fa5, v8
- vse32.v v10, (a1)
- add a2, a2, a0
- bnez a4, .LBB14_7
+ add a5, a0, s6
+ flw fa5, 0(a4)
+ vsetvli a3, a3, e32, m2, ta, ma
+ sh2add a1, a1, s2
+ add a5, a5, a1
+ vle32.v v8, (a1)
+ vle32.v v10, (a5)
+ vfnmsac.vf v8, fa5, v10
+ add a2, a2, a3
+ vse32.v v8, (a1)
+ bne a2, s1, .LBB14_7
j .LBB14_4
/// * Replaces fixed-length stepping (branch-on-cond CanonicalIVInc,
/// VectorTripCount) with variable-length stepping (branch-on-cond
/// EVLIVInc, TripCount).
static void simplifyEVLIVs(VPlan &Plan);
Not sure if the name is accurate; the EVL IVs aren't really simplified, but rather the canonical IV is replaced by the EVL-based phi?
if (I->getNumUsers() != 1) {
// EVLIVIncrement is only used by EVLIV & BranchOnCount. | ||
// More than two is unexpected. | ||
if (I->getNumUsers() > 2) { |
This needs to verify that the other user is BranchOnCount?
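Something along these lines might work, reusing the names from the hunk above (a sketch only, not part of the posted patch; I is the VPInstruction using EVL, i.e. the EVL-based IV increment):

// Allow at most one user besides the EVL-based IV phi, and require that
// extra user to be the BranchOnCount terminator.
if (I->getNumUsers() > 2) {
  errs() << "EVL increment has more than two users\n";
  return false;
}
if (I->getNumUsers() == 2 &&
    !any_of(I->users(), [](VPUser *U) {
      auto *VPI = dyn_cast<VPInstruction>(U);
      return VPI && VPI->getOpcode() == VPInstruction::BranchOnCount;
    })) {
  errs() << "EVL increment's second user is not BranchOnCount\n";
  return false;
}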
if (!EVLPhi)
return;

VPBasicBlock *Entry = EVLPhi->getParent();
VPBasicBlock *Entry = EVLPhi->getParent();
VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
EVLPhi = PhiR;
}

// Early return if no EVL PHI is found
// Early return if no EVL PHI is found
// Early return if no EVL PHI is found.
@@ -193,6 +193,17 @@ struct VPlanTransforms {
/// Replace loop regions with explicit CFG.
static void dissolveLoopRegions(VPlan &Plan);

/// Transform EVL loops to use variable-length stepping after region
This does more: it also converts the EVL-based phi recipe to a plain scalar phi.
// Replace CanonicalIVInc with EVL-PHI increment
VPRecipeBase *CanonicalIV = &*Entry->begin();
assert(dyn_cast<VPPhi>(CanonicalIV) && "Unexpected canoincal iv");
This should probably at least also check that the backedge value is (add CanonicalIV, VFxUF).
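One way to do that, reusing Backedge and CanonicalIV from the quoted context (a sketch that assumes the increment is emitted as add(CanonicalIV, VFxUF) with that operand order):

// Verify the backedge value is the canonical IV increment before rewriting
// its users.
auto *BackedgeR =
    dyn_cast_or_null<VPInstruction>(Backedge->getDefiningRecipe());
assert(BackedgeR && BackedgeR->getOpcode() == Instruction::Add &&
       BackedgeR->getOperand(0) == CanonicalIV->getVPSingleValue() &&
       BackedgeR->getOperand(1) == &Plan.getVFxUF() &&
       "expected canonical IV increment as the backedge value");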
// After: (branch-on-count EVLIVInc, TripCount)
auto Range =
VPBlockUtils::blocksOnly<VPBasicBlock>(vp_depth_first_shallow(Entry));
auto It = find_if(Range, [&Entry](VPBasicBlock *VPBB) { |
looking for the latch could be done by just getting the second predecessor of the header?
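If the header's predecessors after dissolution are always (preheader, latch) in that order, the search could collapse to something like this (sketch, assuming that predecessor order):

// Take the header's second predecessor as the latch-exiting block instead of
// scanning all shallow blocks.
VPBasicBlock *LatchExiting =
    cast<VPBasicBlock>(Entry->getPredecessors()[1]);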
auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());

// Skip single-iteration loop region
// Skip single-iteration loop region
// Skip single-iteration loop region.